# -*-Python-*-

# About 11B params
#
# Some improvements on t5.1.0.11B:
#  - GEGLU-activated feedforward layers instead of RELU
#    see https://arxiv.org/abs/2002.05202
#    Note that d_ff goes down by a factor of 2/3 to compensate for the extra
#    projection.
#  - no dropout during training
#  - no sharing of embedding weights with classifier layer.
#
# Note that we scaled t5.1.1.xxl slightly differently than t5.1.0.11B:
#  d_model, d_ff, and num_heads all increase 4x over t5.1.1.large.

# Pull in the shared t5.1.1 base settings first; every statement after this
# line is applied on top of whatever the included file provides.
include 'models/t5.1.1.base.gin'

# Model-size macros for this variant.
d_model = 4096
d_ff = 10240
num_heads = 64
num_layers = 24

# Hardware layout and batching.
# NOTE(review): presumably the number of cores each model replica is split
# across -- confirm against utils.tpu_mesh_shape.
utils.tpu_mesh_shape.model_parallelism = 8
# NOTE(review): presumably the token budget per replica for one microbatch --
# confirm against utils.serialize_num_microbatches.
utils.serialize_num_microbatches.tokens_per_microbatch_per_replica = 1024
